#' ---
#' title: "Agenda Seeding: Dynamics of Collective Action Data Prep"
#' date: "`r Sys.Date()`"
#' output: html_document
#' header-includes:
#'  - \usepackage{booktabs}
#'  - \usepackage{longtable}
#'  - \usepackage{array}
#'  - \usepackage{multirow}
#'  - \usepackage{wrapfig}
#'  - \usepackage{float}
#'  - \usepackage{colortbl}
#'  - \usepackage{pdflscape}
#'  - \usepackage{tabu}
#'  - \usepackage{threeparttable}
#'  - \usepackage{threeparttablex}
#'  - \usepackage[normalem]{ulem}
#'  - \usepackage{makecell}
#'  - \usepackage{dcolumn}
#'  - \usepackage{setspace}\doublespacing
#' ---



## ---- dca_prep_spin_code, eval = FALSE, include = FALSE ----
# spin code to output Rmd / Rnw
# set output_format to "html_document" for html
# rmarkdown::render(input = here::here("code/dca_protest_prep3.R"), output_format = "pdf_document", clean = TRUE)


## ---- dca_prep_setup1, include = FALSE ----

library(janitor)
library(dplyr)
library(here)
library(knitr)


# Left-pad values with zeros to a width of five characters.
#   x: vector of values to pad (handed to stringr::str_pad as `string`)
# Returns the result of stringr::str_pad: entries shorter than five
# characters gain leading "0"s, longer entries are returned unchanged.
fiver <- function(x) {
    # stringr is not attached by the library() calls in this script,
    # so the call is namespace-qualified to work in a fresh session
    stringr::str_pad(
        string = x,
        width = 5,
        side = "left",
        pad = "0"
    )
}


## ---- dca_load_data, include = FALSE ----

# geocoded coordinates for DCA events
# (readr is not attached above, so read_csv is namespace-qualified)
dca_geocodes <- readr::read_csv(file = here("data/dca_city_state_geocoded.csv"))

dim(dca_geocodes)

# original Dynamics of Collective Action event data (Stata file)
dca_orig <- foreign::read.dta(file = here("data/dynamics_of_collective_action/final_data_v10.dta"))

# attach the selected geocode columns to every original event, keyed on eventid
dca_geocoded <- left_join(
    dca_orig,
    dca_geocodes %>% select(eventid, event_date, mploc, LAT, LON),
    by = "eventid"
)


## ---- recode_dca_data, include = FALSE ----

# subset the geocoded events: initiating group code 401 (black protesters)
# with an event date from 1960-01-01 through 1972-12-31 inclusive
dca_geo <- dplyr::filter(
    dca_geocoded,
    igrp1c1 %in% 401,
    event_date >= "1960-01-01",
    event_date <= "1972-12-31"
)


# report how many events / columns remain
dim(dca_geo)

# participation estimates: part2 starts as a copy of partices
dca_geo$part2 <- dca_geo$partices

# row positions used below; all three depend only on the partices / particex
# columns, which are never modified, so they can be computed up front
miss    <- which(is.na(dca_geo$partices))
notmiss <- which(!is.na(dca_geo$particex))
outlier <-
    # particex school boycott = students
    which(dca_geo$particex > 460000 & dca_geo$particex < 470000)

# impute from other participation estimate if missingness:
# where partices is missing, bin particex into the codes 1-6
if (length(miss) > 0) {
    dca_geo$part2[miss] <- car::recode(
        dca_geo$particex[miss],
        "1:9=1; 10:49=2; 50:99=3; 100:999=4; 1000:9999=5; 10000:10000000=6"
    )
}

# build combined estimate: translate each code 1-6 into the lower bound
# of the particex range it was binned from ...
dca_geo$part3 <- car::recode(
    dca_geo$part2,
    "1=1; 2=10; 3=50; 4=100; 5=1000; 6=10000"
)

# ... then overwrite with particex itself wherever particex is present
if (length(notmiss) > 0) {
    dca_geo$part3[notmiss] <- dca_geo$particex[notmiss]
}

# recode students to protest approx event size
# http://crdl.usg.edu/events/ny_school_boycott/
# "thousands of demonstrators staged peaceful rallies at the Board of Education, City Hall and the Manhattan office of Governor Nelson Rockefeller"

# https://www.nytimes.com/1964/02/04/archives/boycott-cripples-city-schools-absences-360000-above-normal-negroes.html
# NYT: pickets marching at 300 of the city's 860 public schools.
# NYT: 2,600 Marchers Show Up at Buildings‐Donovan Is Critical of Leaders 
# NYT: 3,500 demonstrators, mostly children, on Board of Education headquarters in Brooklyn.
dca_geo$part3[outlier] <- 10000


# recode protester violence variables:
# row positions where any of the four indicators equals 1
viold2 <- which(
    dca_geo$viold == 1 |
        dca_geo$propdam == 1 |
        dca_geo$deaths == 1 |
        dca_geo$injury == 1
)

# start with every event coded nonviolent (v2 = 0, nv = 1) ...
dca_geo$v2 <- 0
dca_geo$nv <- 1

# ... then flip both dummies for the flagged events
dca_geo$v2[viold2] <- 1
dca_geo$nv[viold2] <- 0

# sum(dca$v2==0)
# sum(dca$nv==1)

# sum(dca$v2==1)
# sum(dca$nv==0)

# keep the old object name so code that loads this file keeps working
protest.data <- dca_geo

# write the recoded events to disk for use in other code
save(list = "protest.data", file = here::here("data/DCA_protest_data.Rdata"))

## ---- subset_columns_drop_noncontinental ----
# keep only the identifier, location, date, size, and violence columns
dca <- dca_geo %>%
    dplyr::select(eventid, state1, city1, LON, LAT, event_date, part2, part3, v2, nv) 


# remove noncontinental US
noncontinental <- grep("AK|HI|PR", dca$state1)

# show the rows about to be dropped
dca[noncontinental, ]

# Drop the matches only when there are any. The guard is needed because
# `dca[-integer(0), ]` selects zero rows. It must be a plain `if` statement:
# assigning the value of an `if` without `else` sets `dca` to NULL whenever
# the condition is FALSE.
if (length(noncontinental) > 0) {
    dca <- dca[-noncontinental, ]
}

# clean bad data entry
rickmond <- which(dca$city1 == "RICKMOND")
if (length(rickmond) > 0) {
    dca$city1[rickmond] <- "RICHMOND"
}

# drop non-geocoded rows
dca <- dca %>% dplyr::filter(!is.na(LON))


## ---- dca_save_working_version ----

save(dca, file = here("data/dca_protest_data_black_geocoded.Rdata"))


## ---- load_county_demographic_electoral_data ----

# restore the saved objects into the workspace; verbose = TRUE prints
# the name of each object as it is loaded
load(file = here("data/voting_census_rain.Rdata"), verbose = TRUE)

# list the columns of vc2
names(vc2)



## ---- dca_set_global_protest_thresholds, include = FALSE ----

# an event counts toward a county when it is ...
dist_thresh <- 100      # ... at most this far away (presumably miles, the
                        #     fields::rdist.earth default; confirm)
time_thresh <- 365 * 2  # ... at most this many days before election day
part_thresh <- 10       # ... and has at least this many participants

# identify the election days of the 1964 - 1972 elections
elec64 <- as.Date("1964-11-03") ## 1964 Pres/Cong election
elec68 <- as.Date("1968-11-05") ## 1968 Pres/Cong election
elec72 <- as.Date("1972-11-07") ## 1972 Pres/Cong election

# The masks below assume vc2 holds exactly three consecutive rows per county.
# rep() silently truncates a fractional `times`, which would misalign every
# mask, so fail loudly if the row count is not a multiple of three.
stopifnot(nrow(vc2) %% 3 == 0)

n <- nrow(vc2) / 3

# create dummy indices for county-year rows
# first row = 64, second = 68, third = 72
d64 <- rep(c(TRUE,  FALSE, FALSE), n)
d68 <- rep(c(FALSE, TRUE,  FALSE), n)
d72 <- rep(c(FALSE, FALSE, TRUE),  n)

## ---- merge_dca_protest_data_distance, include = FALSE ----

# create county x protest distance matrix
# (rows follow vc2, columns follow dca)
dist_matrix <- fields::rdist.earth(
    vc2[, c("bg_long", "bg_lat")] %>% as.data.frame(), 
    dca[, c("LON", "LAT")]
    )

dim(dist_matrix)

# create simple distance threshold function
dist_func   <- function(x) {x <= dist_thresh}

# apply the threshold to each protest's column of the distance matrix,
# giving a county x protest table of TRUE / FALSE "within range" flags
# (purrr is not attached in this script, so map_df is namespace-qualified)
dist_treat  <- purrr::map_df(.x = dist_matrix %>% as.data.frame(), 
                             .f = function(x) dist_func(x) )
#dim(dist_treat)




# keep the all-events matrix under the name used in the saved file
dca_dist_matrix <- dist_matrix


# create county x nonviolent protest distance matrix
# NOTE: unlike dist_matrix, this uses only the 1964 rows of vc2
# (fields is not attached, so the call is qualified to match the one above)
dca_nv_dist_matrix <- fields::rdist.earth(
    vc2[vc2$year == 1964, c("bg_long", "bg_lat")] %>% as.data.frame(), 
    dca[dca$nv   == 1,    c("LON", "LAT")]
)

# create county x violent protest distance matrix (1964 rows of vc2 only)
dca_v_dist_matrix <- fields::rdist.earth(
    vc2[vc2$year == 1964, c("bg_long", "bg_lat")] %>% as.data.frame(), 
    dca[dca$v2   == 1,    c("LON", "LAT")]
)


# save county x DCA protest distance matrices
save(dca_dist_matrix, dca_nv_dist_matrix, dca_v_dist_matrix, file = here("data/dca_county_dist_matrix.Rdata"))


## ---- merge_dca_protest_data_time, include = FALSE ----

# Flag, for each election, whether event dates fall in its treatment window.
#   x: event dates (anything as.Date() accepts)
# Returns a data.frame with logical columns t64, t68, t72; an entry is TRUE
# when the event is no more than time_thresh days before that election day
# and no later than the election day itself.
# Reads the globals time_thresh, elec64, elec68, and elec72.
time_func <- function(x) {
    event_day <- as.Date(x)

    # TRUE where the event lies in [election_day - time_thresh, election_day]
    in_window <- function(election_day) {
        days_from_election <- event_day - election_day
        -time_thresh <= days_from_election & days_from_election <= 0
    }

    data.frame(
        t64 = in_window(elec64),
        t68 = in_window(elec68),
        t72 = in_window(elec72)
    )
}

# across all DCA event dates, calc time treatment
# (one row per event, one TRUE / FALSE column per election window;
#  purrr is not attached in this script, so map_df is namespace-qualified)
time_treat <- purrr::map_df(.x = dca[, "event_date", drop = FALSE], 
                            .f = function(x) time_func(x) )

#dim(time_treat)

## ---- merge_carter_protest_data_arrests, include = FALSE ----

# Build the county x event exposure matrix for one protest type and one
# election window.
#   type_flag: 0/1 vector over events (dca$nv or dca$v2) picking the type
#   window:    name of a time_treat column ("t64", "t68", or "t72")
# A cell is non-zero when the event is within dist_thresh of the county, is
# of the requested type with part3 >= part_thresh, and falls in the window.
dca_exposure_matrix <- function(type_flag, window) {
    # `[[` always yields a plain vector; `[, window]` returns a one-column
    # table when time_treat is a tibble, which breaks the recycling below
    in_window  <- time_treat[[window]]
    big_enough <- (dca$part3 * type_flag) >= part_thresh
    # t(dist_treat) is event x county, so the two event-length vectors
    # recycle down each county column; transpose back to county x event
    t(t(dist_treat) * big_enough * in_window)
}

# nonviolent protest: exposure matrices per election
dca_64_matrix <- dca_exposure_matrix(dca$nv, "t64")
dca_68_matrix <- dca_exposure_matrix(dca$nv, "t68")
dca_72_matrix <- dca_exposure_matrix(dca$nv, "t72")

# protest sum (number of qualifying events per county row)
dca_64_nv_sum <- rowSums(dca_64_matrix, na.rm = TRUE) 
dca_68_nv_sum <- rowSums(dca_68_matrix, na.rm = TRUE) 
dca_72_nv_sum <- rowSums(dca_72_matrix, na.rm = TRUE) 

# dca protest binary (any qualifying event)
dca_64_nv_bin <- (dca_64_nv_sum > 0) %>% as.numeric()
dca_68_nv_bin <- (dca_68_nv_sum > 0) %>% as.numeric()
dca_72_nv_bin <- (dca_72_nv_sum > 0) %>% as.numeric()

# combine all years into single vector / column:
# each county-year row takes the value computed for its own election
dca_nv_bin      <- rep(NA, n * 3)
dca_nv_bin[d64] <- dca_64_nv_bin[d64]
dca_nv_bin[d68] <- dca_68_nv_bin[d68]
dca_nv_bin[d72] <- dca_72_nv_bin[d72]

vc2$dca_nv_bin <- dca_nv_bin


# violent protest: same steps (the dca_*_matrix objects are overwritten)
dca_64_matrix <- dca_exposure_matrix(dca$v2, "t64")
dca_68_matrix <- dca_exposure_matrix(dca$v2, "t68")
dca_72_matrix <- dca_exposure_matrix(dca$v2, "t72")

# protest sum
dca_64_v_sum <- rowSums(dca_64_matrix, na.rm = TRUE) 
dca_68_v_sum <- rowSums(dca_68_matrix, na.rm = TRUE) 
dca_72_v_sum <- rowSums(dca_72_matrix, na.rm = TRUE) 

# dca protest binary
dca_64_v_bin <- (dca_64_v_sum > 0) %>% as.numeric()
dca_68_v_bin <- (dca_68_v_sum > 0) %>% as.numeric()
dca_72_v_bin <- (dca_72_v_sum > 0) %>% as.numeric()

# combine all years into single vector / column
dca_v_bin      <- rep(NA, n * 3)
dca_v_bin[d64] <- dca_64_v_bin[d64]
dca_v_bin[d68] <- dca_68_v_bin[d68]
dca_v_bin[d72] <- dca_72_v_bin[d72]

vc2$dca_v_bin <- dca_v_bin



## ---- save_data, eval = TRUE ----

# write vc2, now carrying dca_nv_bin and dca_v_bin, to disk
save(list = "vc2", file = here("data/voting_census_carter_rain_dca.Rdata"))


## ---- build_codebook, eval = FALSE ----

# generate a codebook for vc2; the chunk is not evaluated by default
dataMaid::makeCodebook(
    vc2,
    reportTitle = "DCA Protest, County Demographic and Electoral Data Combined",
    file        = here("codebooks/codebook_dca_protest_data_combined.Rmd"),
    replace     = TRUE,
    checks      = list(character = NULL, factor = NULL)
)

## ---- reset_star_format_for_interactive_use ----

# table output format setting, reset to plain text
star_format <- "text"